/*
 *  Copyright 2008-2013 NVIDIA Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

#pragma once

#include <thrust/detail/config.h>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
#  pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
#  pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
#  pragma system_header
#endif // no system header
#include <thrust/detail/copy_if.h>
#include <thrust/detail/internal_functional.h>
#include <thrust/detail/temporary_array.h>
#include <thrust/detail/type_traits.h>
#include <thrust/distance.h>
#include <thrust/functional.h>
#include <thrust/iterator/iterator_traits.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <thrust/system/detail/generic/copy_if.h>
#include <thrust/transform.h>

#include <cuda/std/limits>

THRUST_NAMESPACE_BEGIN
namespace system
{
namespace detail
{
namespace generic
{
namespace detail
{

// Stencil-driven copy_if assembled from transform, exclusive_scan and scatter_if.
//
// IndexType is the integer type used for the per-element {0,1} flags and for the
// scanned output offsets. The length of [first, last) is converted to IndexType,
// so the caller is expected to pick a type wide enough to hold that length.
//
// The stencil iterator is advanced with `stencil + n`, so it must support
// addition with an IndexType value.
//
// Returns `result` advanced by the number of elements that were scattered, or
// `result` unchanged when the input range is empty.
template <typename IndexType,
          typename DerivedPolicy,
          typename InputIterator1,
          typename InputIterator2,
          typename OutputIterator,
          typename Predicate>
_CCCL_HOST_DEVICE OutputIterator copy_if(
  thrust::execution_policy<DerivedPolicy>& exec,
  InputIterator1 first,
  InputIterator1 last,
  InputIterator2 stencil,
  OutputIterator result,
  Predicate pred)
{
  const auto n = static_cast<IndexType>(::cuda::std::distance(first, last));

  // empty sequence: nothing to copy. This must be handled before the [n - 1]
  // reads at the end of this function; with n == 0 they would index past the
  // (zero-length) temporaries, and n - 1 wraps around for unsigned IndexType.
  if (n == static_cast<IndexType>(0))
  {
    return result;
  }

  // compute {0,1} predicates
  thrust::detail::temporary_array<IndexType, DerivedPolicy> predicates(exec, n);
  thrust::transform(
    exec, stencil, stencil + n, predicates.begin(), thrust::detail::predicate_to_integral<Predicate, IndexType>{pred});

  // scan {0,1} predicates: the exclusive prefix sum of the flags gives each
  // selected element its position in the output
  thrust::detail::temporary_array<IndexType, DerivedPolicy> scatter_indices(exec, n);
  thrust::exclusive_scan(
    exec,
    predicates.begin(),
    predicates.end(),
    scatter_indices.begin(),
    static_cast<IndexType>(0),
    ::cuda::std::plus<IndexType>());

  // scatter the true elements
  thrust::scatter_if(exec, first, last, scatter_indices.begin(), predicates.begin(), result);

  // find the end of the new sequence: the exclusive scan omits the last flag,
  // so add it back to the last offset to get the total number of copied elements
  IndexType output_size = scatter_indices[n - 1] + predicates[n - 1];

  return result + output_size;
}

} // end namespace detail

// Predicate-only overload of copy_if: forwards to the stencil overload of
// thrust::copy_if, handing it the input range as its own stencil.
template <typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename Predicate>
_CCCL_HOST_DEVICE OutputIterator copy_if(
  thrust::execution_policy<DerivedPolicy>& exec,
  InputIterator first,
  InputIterator last,
  OutputIterator result,
  Predicate pred)
{
  // XXX passing [first,last) both as the input and as the stencil is
  //     potentially expensive; a specialization for POD could safely keep
  //     the input in a temporary instead of loading it twice
  InputIterator stencil = first;

  return thrust::copy_if(exec, first, last, stencil, result, pred);
} // end copy_if()

// Stencil overload of copy_if: returns `result` untouched for an empty input
// range, otherwise picks the index type for detail::copy_if from the length of
// [first, last) and returns whatever that helper returns.
template <typename DerivedPolicy,
          typename InputIterator1,
          typename InputIterator2,
          typename OutputIterator,
          typename Predicate>
_CCCL_HOST_DEVICE OutputIterator copy_if(
  thrust::execution_policy<DerivedPolicy>& exec,
  InputIterator1 first,
  InputIterator1 last,
  InputIterator2 stencil,
  OutputIterator result,
  Predicate pred)
{
  using difference_type          = thrust::detail::it_difference_t<InputIterator1>;
  using unsigned_difference_type = ::cuda::std::make_unsigned_t<difference_type>;

  // nothing to do for an empty sequence
  if (first == last)
  {
    return result;
  }

  const difference_type length = ::cuda::std::distance(first, last);

  // compare through the unsigned counterpart of the difference type so the
  // check against the unsigned int limit does not mix signedness
  // (the empty case has already been handled above)
  const auto unsigned_length = static_cast<unsigned_difference_type>(length);

  // 32-bit indices suffice unless the difference type is wider than
  // unsigned int AND the length actually exceeds what unsigned int can hold
  const bool needs_wide_indices = sizeof(difference_type) > sizeof(unsigned int)
                               && unsigned_length > ::cuda::std::numeric_limits<unsigned int>::max();

  if (needs_wide_indices)
  {
    return detail::copy_if<difference_type>(exec, first, last, stencil, result, pred);
  }

  // the common case (almost always)
  return detail::copy_if<unsigned int>(exec, first, last, stencil, result, pred);
} // end copy_if()

} // end namespace generic
} // end namespace detail
} // end namespace system
THRUST_NAMESPACE_END
